# 数据处理完整思路
# 1:载入需要的数据。原始数据中除抑郁问卷外,还包括焦虑问卷,因此只载入测量抑郁的问卷,及人口学信息。具体为DSRSC、CDI、PHQ-9、和DASS-21中测量抑郁的题目。
# 2:转换数据为数值型。
# 3:处理缺失值(实际没有缺失值)。
# 4:原始数据-1,用以方便计算检出率。原始数据的范围与量表正常的数据范围是不一致的。例如:DSRSC量表的计分范围正常应该为0-2,检出率为≥15。但咱们收到的原始数据计分范围就是1-3,也就是所有数值都+了1。因此-1后才能计算正确的检出率。
# 5:描述统计,原始数据的年龄范围显示为0-29岁,因此参考李医生的建议,筛除了7岁以下和18岁以上的人。
# 计算了1、性别比例。2、年龄范围。
# 6:数据质量检查。首先,使用原始数据进行了相关分析,用以检查数据。
# 发现问题:1、DSRSC问卷内相关有正有负。2、CDI问卷内相关有正有负。因此,数据应该进行反向计分,与李医生讨论过确实需要进行反向计分。
# 7:反向计分,以及反向计分后的相关分析。
# 8: 检验问卷内部合并的及跨问卷测量同一个症状的相关系数是否大于其他的
# 9:检出率计算




# 载入必要的包
library(here)
## here() starts at D:/心理健康测量/MH_CPL/5_Analysis/5_2_Measurement data analysis
library(bruceR)
## 
## bruceR (v2023.9)
## Broadly Useful Convenient and Efficient R functions
## 
## Packages also loaded:
## ✔ data.table ✔ emmeans
## ✔ dplyr      ✔ lmerTest
## ✔ tidyr      ✔ effectsize
## ✔ stringr    ✔ performance
## ✔ ggplot2    ✔ interactions
## 
## Main functions of `bruceR`:
## cc()             Describe()  TTEST()
## add()            Freq()      MANOVA()
## .mean()          Corr()      EMMEANS()
## set.wd()         Alpha()     PROCESS()
## import()         EFA()       model_summary()
## print_table()    CFA()       lavaan_summary()
## 
## For full functionality, please install all dependencies:
## install.packages("bruceR", dep=TRUE)
## 
## Online documentation:
## https://psychbruce.github.io/bruceR
## 
## To use this package in publications, please cite:
## Bao, H.-W.-S. (2023). bruceR: Broadly useful convenient and efficient R functions (Version 2023.9) [Computer software]. https://CRAN.R-project.org/package=bruceR
## 
## NEWS: A new version of bruceR (2024.6) is available (2024-06-13)!
## 
## ***** Please update *****
## install.packages("bruceR", dep=TRUE)
## 
## These packages are dependencies of `bruceR` but not installed:
## - cowplot, ggtext, see, lmtest, vars, phia, BayesFactor, GPArotation
## 
## ***** Install all dependencies *****
## install.packages("bruceR", dep=TRUE)
## 
## 载入程序包:'bruceR'
## The following object is masked _by_ 'package:data.table':
## 
##     %notin%
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ Matrix::expand()      masks tidyr::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ Matrix::pack()        masks tidyr::pack()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ Matrix::unpack()      masks tidyr::unpack()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openxlsx)
## Warning: 程序包'openxlsx'是用R版本4.4.1 来建造的
library(ggcorrplot)
## Warning: 程序包'ggcorrplot'是用R版本4.4.1 来建造的
#1.载入数据
raw_data <- bruceR::import(here::here("data", "Rawdata1.xlsx"))
## New names:
## • `量表ID` -> `量表ID...6`
## • `量表名称` -> `量表名称...7`
## • `量表ID` -> `量表ID...49`
## • `量表ID` -> `量表ID...69`
## • `量表ID` -> `量表ID...80`
## • `量表名称` -> `量表名称...81`
## • `量表ID` -> `量表ID...89`
## • `量表ID` -> `量表ID...98`
#筛选与抑郁有关的数据
selected_data <- raw_data %>%
  select(就诊卡号, 性别, 年龄, 
         DSRSC1:DSRSC18, 
         PHQ1:PHQ9, 
         DASS3, DASS5, DASS10, DASS13, DASS16, DASS17, DASS21, 
         CDI1:CDI27)

#2.转换数据类型
selected_data_numeric <- selected_data %>%
  mutate(性别 = factor(性别, levels = c("男", "女"), labels = c(1, 2))) %>%
  mutate(across(matches("性别|年龄|DSRSC|PHQ|DASS|CDI"), as.numeric))

#3.处理缺失值
clean_data <- selected_data_numeric %>%
  drop_na()

#4.原始数据-1,用以方便计算检出率
transformed_data <- clean_data %>%
  mutate(across(matches("DSRSC|PHQ|DASS|CDI"), ~ .x - 1))

#5.筛选出年龄在7岁以上且不超过18岁的数据
filtered_data <- transformed_data %>%
  filter(年龄 >= 7 & 年龄 <= 18)

# 描述统计
# 计算性别比例
gender_proportion <- filtered_data %>%
  summarise(
    male = sum(性别 == 1, na.rm = TRUE),
    female = sum(性别 == 2, na.rm = TRUE),
    total = n(),
    male_proportion = male / total * 100,
    female_proportion = female / total * 100
  )

# 打印性别比例
print(gender_proportion)
##   male female total male_proportion female_proportion
## 1 6704   6060 12764        52.52272          47.47728
# 计算年龄的范围
age_range <- filtered_data %>%
  summarise(
    min_age = min(年龄, na.rm = TRUE),
    max_age = max(年龄, na.rm = TRUE)
  )

# 打印年龄范围
print(age_range)
##   min_age max_age
## 1       7      18
# 检查并创建输出目录
output_dir <- here::here("output")
if (!dir.exists(output_dir)) {
  dir.create(output_dir)
}
# 6.数据质量检查,反向计分前的相关分析
correlation_matrix <- filtered_data %>%
  select(matches("DSRSC|PHQ|DASS|CDI")) %>%
  cor()

# 导出
output_file <- here::here("output", "correlation_matrix.xlsx")
write.xlsx(correlation_matrix, output_file)

# 可视化
p <- ggcorrplot(correlation_matrix, lab = TRUE, 
                method = "circle", 
                outline.color = "white", 
                colors = c("blue", "white", "red"), 
                ggtheme = ggplot2::theme_minimal(base_family = "sans"), 
                title = "未反向计分的相关系数") +
  theme_minimal(base_family = "sans") + 
  theme(
    text = element_text(size = 20, color = "black"), 
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14), 
    axis.text.y = element_text(size = 14), 
    axis.title = element_text(size = 22), 
    plot.background = element_rect(fill = "gray90"), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "gray90", color = NA), 
    legend.position = "bottom", 
    legend.key.width = unit(4, "cm"), 
    legend.key.height = unit(1, "cm")
  ) +
  scale_size_continuous(range = c(5, 12)) +
  labs(x = "", y = "")
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
# 保存图片
ggsave(here::here("output", "correlation_heatmap.png"), 
       plot = p, width = 32, height = 28, dpi = 300)
反向计分前的相关分析
反向计分前的相关分析
# 7.进行反向计分
# CDI 反向计分项目
CDI_reverse_items <- c("CDI2", "CDI5", "CDI7", "CDI8", "CDI10", 
                       "CDI11", "CDI13", "CDI15", "CDI16", 
                       "CDI18", "CDI21", "CDI24", "CDI25")

# DSRSC 反向计分项目
DSRSC_reverse_items <- c("DSRSC3", "DSRSC5", "DSRSC6", "DSRSC10", 
                         "DSRSC14", "DSRSC15", "DSRSC17", "DSRSC18")

# 对CDI和DSRSC项目进行反向计分
reversed_data <- filtered_data %>%
  mutate(across(all_of(CDI_reverse_items), ~ 2 - .)) %>%  # 对于0-2范围,反向计分为2减去原分数
  mutate(across(all_of(DSRSC_reverse_items), ~ 2 - .))

# 计算反向计分后的相关性
correlation_matrix_reversed <- reversed_data %>%
  select(matches("DSRSC|PHQ|DASS|CDI")) %>%
  cor()

# 导出
output_file_reversed <- here::here("output", "correlation_matrix_reversed.xlsx")
write.xlsx(correlation_matrix_reversed, output_file_reversed)

# 可视化
p_reversed <- ggcorrplot(correlation_matrix_reversed, lab = TRUE, 
                         method = "circle", 
                         outline.color = "white", 
                         colors = c("blue", "white", "red"), 
                         ggtheme = ggplot2::theme_minimal(base_family = "sans"), 
                         title = "反向计分后的相关系数") +
  theme_minimal(base_family = "sans") + 
  theme(
    text = element_text(size = 20, color = "black"), 
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14), 
    axis.text.y = element_text(size = 14), 
    axis.title = element_text(size = 22), 
    plot.background = element_rect(fill = "gray90"), 
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(), 
    panel.background = element_rect(fill = "gray90", color = NA), 
    legend.position = "bottom", 
    legend.key.width = unit(4, "cm"), 
    legend.key.height = unit(1, "cm")
  ) +
  scale_size_continuous(range = c(5, 12)) +
  labs(x = "", y = "")
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
# 保存图片
ggsave(here::here("output", "correlation_heatmap_reversed.png"), 
       plot = p_reversed, width = 32, height = 28, dpi = 300)
反向计分后的相关分析
反向计分后的相关分析
# 使用BOOT法计算DSRSC 2和14、10和12的区间,并且比较其他相关系数的区间。

cor_Q2_Q14 <- cor(reversed_data$DSRSC2, reversed_data$DSRSC14)
cor_Q10_Q12 <- cor(reversed_data$DSRSC10, reversed_data$DSRSC12)

# 计算其他相关系数
cor_matrix <- cor(reversed_data[, grep("DSRSC", names(reversed_data))])
other_cor <- as.vector(cor_matrix[upper.tri(cor_matrix)])

# Bootstrap函数
bootstrap_ci <- function(data1, data2, n_iterations = 1000) {
  # 存储相关系数
  cor_vals <- numeric(n_iterations)
  for (i in 1:n_iterations) {
    # 重采样
    sample_indices <- sample(1:length(data1), length(data1), replace = TRUE)
    cor_vals[i] <- cor(data1[sample_indices], data2[sample_indices])
  }
  # 返回置信区间
  return(quantile(cor_vals, probs = c(0.025, 0.975)))
}

# 计算Q2与Q14的置信区间
ci_Q2_Q14 <- bootstrap_ci(reversed_data$DSRSC2, reversed_data$DSRSC14)

# 计算Q10与Q12的置信区间
ci_Q10_Q12 <- bootstrap_ci(reversed_data$DSRSC10, reversed_data$DSRSC12)

# 计算其他相关系数的均值和置信区间
mean_other_cor <- mean(other_cor)
ci_other_cor <- quantile(other_cor, probs = c(0.025, 0.975))

# 打印结果

cat("Q2与Q14的置信区间:", ci_Q2_Q14, "\n")
## Q2与Q14的置信区间: 0.318588 0.3508647
cat("Q10与Q12的置信区间:", ci_Q10_Q12, "\n")
## Q10与Q12的置信区间: 0.4345954 0.4620834
cat("其他相关系数置信区间:", ci_other_cor, "\n")
## 其他相关系数置信区间: 0.1721605 0.5507313
# 8.检验问卷内部合并的症状的相关系数是否大于其他的。
# DSRSC合并了2和14、10和12。
# CDIQ3Q7Q24合并 Q15Q23合并。
# 先做每个相关值和其他值的单样本T检验,再做不同组(合并组/其他组)的独立样本T检验。


# 定义一个通用函数进行逐个比较的检验
test_correlation <- function(vars, data, cor_matrix) {
  results <- data.frame(variable1 = character(), variable2 = character(), correlation = numeric(), p_value = numeric(), stringsAsFactors = FALSE)

  for (pair in vars) {
    cor_value <- cor(data[[pair[1]]], data[[pair[2]]])
    
    # 计算其他相关系数
    other_cor <- as.vector(cor_matrix[upper.tri(cor_matrix)])
    other_cor <- other_cor[other_cor != cor_value]  # 排除当前相关系数
    mean_other_cor <- mean(other_cor)
    
    # t检验
    t_test_result <- (cor_value - mean_other_cor) / (sd(other_cor) / sqrt(length(other_cor)))
    df <- length(other_cor) - 1
    p_value <- 1 - pt(t_test_result, df)
    
    # 存储结果
    results <- rbind(results, data.frame(variable1 = pair[1], variable2 = pair[2], correlation = cor_value, p_value = p_value))
  }
  
  return(results)
}

# 进行逐个比较的主函数
run_analysis <- function(pairs, data, cor_matrix, label) {
  results <- test_correlation(pairs, data, cor_matrix)
  print(results)
  
  first_class_cor <- sapply(pairs, function(pair) cor(data[[pair[1]]], data[[pair[2]]]))
  mean_first_class_cor <- mean(first_class_cor)
  
  other_cor <- as.vector(cor_matrix[upper.tri(cor_matrix)])
  mean_other_cor <- mean(other_cor)
  
  t_test_result <- t.test(first_class_cor, other_cor, alternative = "greater")
  
  cat(label, "第一类相关系数均值:", mean_first_class_cor, "\n")
  cat(label, "其他相关系数均值:", mean_other_cor, "\n")
  cat(label, "t检验结果:\n")
  print(t_test_result)
}

# DSRSC变量组合
pairs_DSRSC <- list(
  c("DSRSC2", "DSRSC14"),
  c("DSRSC10", "DSRSC12")
)

# 执行DSRSC分析
run_analysis(pairs_DSRSC, reversed_data, cor_matrix, "DSRSC")
##   variable1 variable2 correlation   p_value
## 1    DSRSC2   DSRSC14   0.3351449 0.9999382
## 2   DSRSC10   DSRSC12   0.4477969 0.0000000
## DSRSC 第一类相关系数均值: 0.3914709 
## DSRSC 其他相关系数均值: 0.3679623 
## DSRSC t检验结果:
## 
##  Welch Two Sample t-test
## 
## data:  first_class_cor and other_cor
## t = 0.41288, df = 1.0442, p-value = 0.3742
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.3112526        Inf
## sample estimates:
## mean of x mean of y 
## 0.3914709 0.3679623
# CDI变量组合
pairs_CDI <- list(
  c("CDI3", "CDI7"),
  c("CDI3", "CDI24"),
  c("CDI7", "CDI24"),
  c("CDI15", "CDI23")
)

# 执行CDI分析
run_analysis(pairs_CDI, reversed_data, cor_matrix, "CDI")
##   variable1 variable2 correlation      p_value
## 1      CDI3      CDI7   0.3713427 3.426870e-01
## 2      CDI3     CDI24   0.3471093 9.933328e-01
## 3      CDI7     CDI24   0.4986589 0.000000e+00
## 4     CDI15     CDI23   0.4200371 1.938337e-09
## CDI 第一类相关系数均值: 0.409287 
## CDI 其他相关系数均值: 0.3679623 
## CDI t检验结果:
## 
##  Welch Two Sample t-test
## 
## data:  first_class_cor and other_cor
## t = 1.1996, df = 3.3837, p-value = 0.1538
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.03613285         Inf
## sample estimates:
## mean of x mean of y 
## 0.4092870 0.3679623
# 8.1 检验问卷间测量同一个症状的相关系数是否大于其他的。
# 测量相同症状的有 1.DSRSC16 PHQ2 2.CDI1 DSRSC17 3.CDI4 DASS3
# 4.CDI11 DSRSC18 5.CDI20 DSRSC15 6.DSRSC11 PHQ6 7.CDI2 DSRSC1 DASS10
# 8.CDI3 7 24  DASS17 9.DSRSC7 DASS5 10.DSRSC10 12 PHQ1 11.CDI17 PHQ4 12.CDI18 PHQ5
# 13. CDI16 PHQ3 14.CDI12 DSRSC4 15.CDI10 DSRSC3 16.CDI9 PHQ9
# 先做每个相关值和其他值的单样本T检验,再做不同组(合并组/其他组)的独立样本T检验。


# 逐个进行比较
# 定义一个函数进行检验
test_correlation <- function(var1, var2) {
  cor_value <- cor(reversed_data[[var1]], reversed_data[[var2]])
  other_cor <- as.vector(cor_matrix[upper.tri(cor_matrix)])  # 计算其他相关系数
  mean_other_cor <- mean(other_cor)
  
  t_test_result <- (cor_value - mean_other_cor) / (sd(other_cor) / sqrt(length(other_cor)))
  df <- length(other_cor) - 1
  p_value <- 1 - pt(t_test_result, df)
  
  return(data.frame(variable1 = var1, variable2 = var2, correlation = cor_value, p_value = p_value))
}

# 所有组合的变量
pairs <- list(
  c("DSRSC16", "PHQ2"),
  c("CDI1", "DSRSC17"),
  c("CDI4", "DASS3"),
  c("CDI11", "DSRSC18"),
  c("CDI20", "DSRSC15"),
  c("DSRSC11", "PHQ6"),
  c("CDI2", "DSRSC1"),
  c("CDI2", "DASS10"),
  c("DSRSC1", "DASS10"),
  c("CDI3", "DASS17"),
  c("CDI7", "DASS17"),
  c("CDI24", "DASS17"),
  c("DSRSC7", "DASS5"),
  c("DSRSC10", "PHQ1"),
  c("DSRSC12", "PHQ1"),
  c("CDI17", "PHQ4"),
  c("CDI18", "PHQ5"),
  c("CDI16", "PHQ3"),
  c("CDI12", "DSRSC4"),
  c("CDI10", "DSRSC3"),
  c("CDI9", "PHQ9")
)

# 进行检验并存储结果
results <- do.call(rbind, lapply(pairs, function(x) test_correlation(x[1], x[2])))

# 打印结果
print(results)
##    variable1 variable2 correlation      p_value
## 1    DSRSC16      PHQ2   0.3767166 1.474149e-01
## 2       CDI1   DSRSC17   0.5718628 0.000000e+00
## 3       CDI4     DASS3   0.5651490 0.000000e+00
## 4      CDI11   DSRSC18   0.4570984 0.000000e+00
## 5      CDI20   DSRSC15   0.7199515 0.000000e+00
## 6    DSRSC11      PHQ6   0.4172825 1.019307e-08
## 7       CDI2    DSRSC1   0.2533121 1.000000e+00
## 8       CDI2    DASS10   0.3137605 1.000000e+00
## 9     DSRSC1    DASS10   0.4462108 0.000000e+00
## 10      CDI3    DASS17   0.3496432 9.853333e-01
## 11      CDI7    DASS17   0.6389790 0.000000e+00
## 12     CDI24    DASS17   0.4561890 0.000000e+00
## 13    DSRSC7     DASS5   0.3443189 9.974284e-01
## 14   DSRSC10      PHQ1   0.5032363 0.000000e+00
## 15   DSRSC12      PHQ1   0.4525730 0.000000e+00
## 16     CDI17      PHQ4   0.6425887 0.000000e+00
## 17     CDI18      PHQ5   0.4990844 0.000000e+00
## 18     CDI16      PHQ3   0.6379242 0.000000e+00
## 19     CDI12    DSRSC4   0.4370466 2.686740e-14
## 20     CDI10    DSRSC3   0.6719539 0.000000e+00
## 21      CDI9      PHQ9   0.7355485 0.000000e+00
# 将他们看成一个组进行比较
# 特定的跨问卷相关系数
cross_questionnaire_pairs <- list(
  c("DSRSC16", "PHQ2"),
  c("CDI1", "DSRSC17"),
  c("CDI4", "DASS3"),
  c("CDI11", "DSRSC18"),
  c("CDI20", "DSRSC15"),
  c("DSRSC11", "PHQ6"),
  c("CDI2", "DSRSC1"),
  c("CDI2", "DASS10"),
  c("DSRSC1", "DASS10"),
  c("CDI3", "DASS17"),
  c("CDI7", "DASS17"),
  c("CDI24", "DASS17"),
  c("DSRSC7", "DASS5"),
  c("DSRSC10", "PHQ1"),
  c("DSRSC12", "PHQ1"),
  c("CDI17", "PHQ4"),
  c("CDI18", "PHQ5"),
  c("CDI16", "PHQ3"),
  c("CDI12", "DSRSC4"),
  c("CDI10", "DSRSC3"),
  c("CDI9", "PHQ9")
)

# 获取特定的相关系数
cross_correlations <- sapply(cross_questionnaire_pairs, function(pair) {
  cor(reversed_data[[pair[1]]], reversed_data[[pair[2]]])
})

# 获取其他相关系数
other_correlations <- as.vector(cor_matrix[upper.tri(cor_matrix)])

# 计算均值
mean_other_cor <- mean(other_correlations)

# t检验
t_test_result <- (mean(cross_correlations) - mean_other_cor) / (sd(other_correlations) / sqrt(length(other_correlations)))

# 计算自由度
df <- length(other_correlations) - 1

# 打印结果
cat("特定相关系数均值:", mean(cross_correlations), "\n")
## 特定相关系数均值: 0.4995443
cat("其他相关系数均值:", mean_other_cor, "\n")
## 其他相关系数均值: 0.3679623
cat("t值:", t_test_result, "\n")
## t值: 15.80038
# 计算p值
p_value <- 2 * pt(-abs(t_test_result), df)
cat("p值:", p_value, "\n")
## p值: 6.908295e-34
# 9.计算检出率,并且可视化。
# 计算DSRSC和PHQ-9的
# 设置阈值
cutoff_DSRSC <- 15
cutoff_PHQ <- 10

# 计算DSRSC和PHQ的总分
total_DSRSC_scores <- rowSums(reversed_data[, grep("^DSRSC", names(reversed_data))])
total_PHQ_scores <- rowSums(reversed_data[, grep("^PHQ", names(reversed_data))])

# 计算超过cutoff的人数
detected_DSRSC <- sum(total_DSRSC_scores >= cutoff_DSRSC)
detected_PHQ <- sum(total_PHQ_scores >= cutoff_PHQ)

# 计算总样本数量
total_count <- nrow(reversed_data)

# 计算检出率
detection_rate_DSRSC <- (detected_DSRSC / total_count) * 100
detection_rate_PHQ <- (detected_PHQ / total_count) * 100

# 打印结果
cat("DSRSC总分超过cutoff的人数:", detected_DSRSC, "\n")
## DSRSC总分超过cutoff的人数: 6652
cat("DSRSC检出率:", detection_rate_DSRSC, "%\n")
## DSRSC检出率: 52.11532 %
cat("PHQ总分超过cutoff的人数:", detected_PHQ, "\n")
## PHQ总分超过cutoff的人数: 6823
cat("PHQ检出率:", detection_rate_PHQ, "%\n")
## PHQ检出率: 53.45503 %
# 计算CDI的,排除第26题
cutoff_CDI <- 20  
total_CDI_scores <- rowSums(reversed_data[, grep("^CDI", names(reversed_data))]) - reversed_data$CDI26

# 计算超过cutoff的人数
detected_CDI <- sum(total_CDI_scores >= cutoff_CDI)

# 计算检出率
detection_rate_CDI <- (detected_CDI / total_count) * 100

# 打印结果
cat("CDI总分超过cutoff的人数:", detected_CDI, "\n")
## CDI总分超过cutoff的人数: 6286
cat("CDI检出率:", detection_rate_CDI, "%\n")
## CDI检出率: 49.24788 %
# 创建检出率数据框
detection_rates <- data.frame(
  Measure = c("DSRSC", "PHQ-9", "CDI"),
  Detection_Rate = c(detection_rate_DSRSC, detection_rate_PHQ, detection_rate_CDI)
)


# 绘制图形
ggplot(detection_rates, aes(x = Measure, y = Detection_Rate, fill = Measure)) +
  geom_bar(stat = "identity", color = "black") +
  scale_fill_manual(values = c("#1f77b4", "#ff7f0e", "#2ca02c")) + 
  labs(x = "Measure", y = "Detection Rate (%)") +
  theme_minimal(base_size = 15) + # 设置基础字体大小
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white"), # 设置面板背景为白色
    plot.background = element_rect(fill = "white"), # 设置整个图背景为白色
    plot.title = element_blank(),
    legend.position = "none"
  ) +
  geom_text(aes(label = round(Detection_Rate, 2)), vjust = -0.5)

# 保存图像到figure文件夹
ggsave(here::here("figure", "detection_rates_plot.png"), width = 8, height = 5, dpi = 300)